In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
In [2]:
data=pd.read_csv('C:/Users/Pankaj Mali/Desktop/Machine learning/Machine learning project/WineQT.csv')
In [3]:
data.head()
Out[3]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality Id
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5 0
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5 1
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5 2
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6 3
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5 4
In [4]:
data.tail()
Out[4]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality Id
1138 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6 1592
1139 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 6 1593
1140 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5 1594
1141 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6 1595
1142 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5 1597
In [5]:
data.shape
Out[5]:
(1143, 13)
In [36]:
data['quality'].unique()
Out[36]:
array([5, 6, 7, 4, 8, 3], dtype=int64)
In [59]:
len12=data['quality'].unique()
len(len12)
Out[59]:
6
In [60]:
data['quality'].value_counts()
Out[60]:
5    483
6    462
7    143
4     33
8     16
3      6
Name: quality, dtype: int64
In [62]:
data.groupby('quality',as_index=False).count()
Out[62]:
quality fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol Id
0 3 6 6 6 6 6 6 6 6 6 6 6 6
1 4 33 33 33 33 33 33 33 33 33 33 33 33
2 5 483 483 483 483 483 483 483 483 483 483 483 483
3 6 462 462 462 462 462 462 462 462 462 462 462 462
4 7 143 143 143 143 143 143 143 143 143 143 143 143
5 8 16 16 16 16 16 16 16 16 16 16 16 16
In [7]:
data.describe().transpose()
Out[7]:
count mean std min 25% 50% 75% max
fixed acidity 1143.0 8.311111 1.747595 4.60000 7.10000 7.90000 9.100000 15.90000
volatile acidity 1143.0 0.531339 0.179633 0.12000 0.39250 0.52000 0.640000 1.58000
citric acid 1143.0 0.268364 0.196686 0.00000 0.09000 0.25000 0.420000 1.00000
residual sugar 1143.0 2.532152 1.355917 0.90000 1.90000 2.20000 2.600000 15.50000
chlorides 1143.0 0.086933 0.047267 0.01200 0.07000 0.07900 0.090000 0.61100
free sulfur dioxide 1143.0 15.615486 10.250486 1.00000 7.00000 13.00000 21.000000 68.00000
total sulfur dioxide 1143.0 45.914698 32.782130 6.00000 21.00000 37.00000 61.000000 289.00000
density 1143.0 0.996730 0.001925 0.99007 0.99557 0.99668 0.997845 1.00369
pH 1143.0 3.311015 0.156664 2.74000 3.20500 3.31000 3.400000 4.01000
sulphates 1143.0 0.657708 0.170399 0.33000 0.55000 0.62000 0.730000 2.00000
alcohol 1143.0 10.442111 1.082196 8.40000 9.50000 10.20000 11.100000 14.90000
quality 1143.0 5.657043 0.805824 3.00000 5.00000 6.00000 6.000000 8.00000
Id 1143.0 804.969379 463.997116 0.00000 411.00000 794.00000 1209.500000 1597.00000
In [8]:
data.isnull().any()
Out[8]:
fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
Id                      False
dtype: bool
In [11]:
max_thresold=data['total sulfur dioxide'].quantile(0.99)
max_thresold
Out[11]:
143.0
In [24]:
data[data['total sulfur dioxide']>max_thresold]
Out[24]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality Id
249 6.1 0.210 0.40 1.4 0.066 40.5 165.0 0.99120 3.25 0.59 11.9 6 354
366 8.5 0.655 0.49 6.1 0.122 34.0 151.0 1.00100 3.31 1.14 9.3 5 515
421 6.6 0.390 0.49 1.7 0.070 23.0 149.0 0.99220 3.12 0.50 11.5 6 591
452 9.6 0.880 0.28 2.4 0.086 30.0 147.0 0.99790 3.24 0.53 9.4 5 636
453 9.5 0.885 0.27 2.3 0.084 31.0 145.0 0.99780 3.24 0.53 9.4 5 637
460 6.7 0.420 0.27 8.6 0.068 24.0 148.0 0.99480 3.16 0.57 11.3 6 649
485 9.8 0.980 0.32 2.3 0.078 35.0 152.0 0.99800 3.25 0.48 9.4 5 684
554 9.5 0.570 0.27 2.3 0.082 23.0 144.0 0.99782 3.27 0.55 9.4 5 772
760 7.9 0.300 0.68 8.3 0.050 37.5 278.0 0.99316 3.01 0.51 12.3 7 1079
761 7.9 0.300 0.68 8.3 0.050 37.5 289.0 0.99316 3.01 0.51 12.3 7 1081
1066 7.7 0.540 0.26 1.9 0.089 23.0 147.0 0.99636 3.26 0.59 9.7 5 1493
In [20]:
min_thresold=data['total sulfur dioxide'].quantile(0.00)
min_thresold
Out[20]:
6.0
In [30]:
data[data['total sulfur dioxide']<min_thresold]
Out[30]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality Id
In [32]:
Newdata=data[(data['total sulfur dioxide']<=max_thresold)&(data['total sulfur dioxide']>=min_thresold)]
Newdata
Out[32]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality Id
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5 0
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5 1
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5 2
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6 3
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5 4
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1138 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6 1592
1139 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 6 1593
1140 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5 1594
1141 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6 1595
1142 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5 1597

1132 rows × 13 columns

In [34]:
Newdata['total sulfur dioxide'].min()
Out[34]:
6.0
In [35]:
Newdata['total sulfur dioxide'].max()
Out[35]:
143.0
In [55]:
del Newdata['Id']
In [109]:
Newdata
Out[109]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5
... ... ... ... ... ... ... ... ... ... ... ... ...
1138 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6
1139 6.8 0.620 0.08 1.9 0.068 28.0 38.0 0.99651 3.42 0.82 9.5 6
1140 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5
1141 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6
1142 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5

1132 rows × 12 columns

In [ ]:
 
In [56]:
corr_data=Newdata.corr()
corr_data
Out[56]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
fixed acidity 1.000000 -0.262077 0.680659 0.181421 0.107023 -0.166066 -0.116545 0.683086 -0.692338 0.175774 -0.069622 0.126654
volatile acidity -0.262077 1.000000 -0.547116 0.003978 0.054120 -0.002843 0.092629 0.000819 0.220828 -0.280656 -0.193672 -0.402841
citric acid 0.680659 -0.547116 1.000000 0.164192 0.249674 -0.071173 0.000896 0.390543 -0.543077 0.336818 0.100660 0.237339
residual sugar 0.181421 0.003978 0.164192 1.000000 0.078343 0.151316 0.140406 0.406271 -0.103493 0.018741 0.047316 0.009898
chlorides 0.107023 0.054120 0.249674 0.078343 1.000000 0.019238 0.066558 0.205668 -0.283000 0.373461 -0.227752 -0.121725
free sulfur dioxide -0.166066 -0.002843 -0.071173 0.151316 0.019238 1.000000 0.667573 -0.046270 0.086039 0.040114 -0.051811 -0.066355
total sulfur dioxide -0.116545 0.092629 0.000896 0.140406 0.066558 0.667573 1.000000 0.089157 -0.024444 0.049778 -0.225650 -0.215218
density 0.683086 0.000819 0.390543 0.406271 0.205668 -0.046270 0.089157 1.000000 -0.368900 0.135537 -0.488231 -0.167122
pH -0.692338 0.220828 -0.543077 -0.103493 -0.283000 0.086039 -0.024444 -0.368900 1.000000 -0.193253 0.234363 -0.047672
sulphates 0.175774 -0.280656 0.336818 0.018741 0.373461 0.040114 0.049778 0.135537 -0.193253 1.000000 0.099578 0.263122
alcohol -0.069622 -0.193672 0.100660 0.047316 -0.227752 -0.051811 -0.225650 -0.488231 0.234363 0.099578 1.000000 0.479886
quality 0.126654 -0.402841 0.237339 0.009898 -0.121725 -0.066355 -0.215218 -0.167122 -0.047672 0.263122 0.479886 1.000000
In [57]:
plt.figure(figsize=(10,5))
sns.heatmap(corr_data,annot=True,vmax=1,vmin=-1,cmap='PuBu')
Out[57]:
<AxesSubplot:>
In [63]:
sns.barplot(x='quality',y='alcohol',data=Newdata)
Out[63]:
<AxesSubplot:xlabel='quality', ylabel='alcohol'>

quality_value=Newdata['quality'].values alcohol_value=Newdata['alcohol'].values plt.bar(quality_value,alcohol_value)

In [65]:
sns.barplot(x='quality',y='sulphates',data=Newdata)
Out[65]:
<AxesSubplot:xlabel='quality', ylabel='sulphates'>
In [70]:
sns.barplot(x='quality',y='citric acid',data=Newdata)
Out[70]:
<AxesSubplot:xlabel='quality', ylabel='citric acid'>
In [71]:
sns.barplot(x='quality',y='fixed acidity',data=Newdata)
Out[71]:
<AxesSubplot:xlabel='quality', ylabel='fixed acidity'>
In [73]:
sns.barplot(x='quality',y='residual sugar',data=Newdata)
Out[73]:
<AxesSubplot:xlabel='quality', ylabel='residual sugar'>
In [74]:
sns.barplot(x='quality',y='free sulfur dioxide',data=Newdata)
Out[74]:
<AxesSubplot:xlabel='quality', ylabel='free sulfur dioxide'>
In [75]:
sns.barplot(x='quality',y='total sulfur dioxide',data=Newdata)
Out[75]:
<AxesSubplot:xlabel='quality', ylabel='total sulfur dioxide'>
In [86]:
Newdata[['pH','quality']].groupby('quality',as_index=False).min()
Out[86]:
quality pH
0 3 3.16
1 4 2.74
2 5 2.88
3 6 2.86
4 7 2.92
5 8 2.88
In [87]:
Newdata[['pH','quality']].groupby('quality',as_index=False).max()
Out[87]:
quality pH
0 3 3.55
1 4 3.90
2 5 3.74
3 6 4.01
4 7 3.78
5 8 3.72
In [89]:
sns.boxplot(Newdata['quality'],Newdata['pH'])
Out[89]:
<AxesSubplot:xlabel='quality', ylabel='pH'>
In [94]:
sns.pairplot(Newdata,hue='quality')
Out[94]:
<seaborn.axisgrid.PairGrid at 0x1b38db68520>
In [131]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
In [176]:
scaler=StandardScaler()
x1=Newdata.drop('quality',axis=1)
y1=Newdata['quality']

scaler.fit(x1)
x_transformed=scaler.transform(x1)
In [268]:
x_train,x_test,y_train,y_test=train_test_split(x_transformed,y1,test_size=0.28,random_state=80)
In [269]:
model=SVC()
model.fit(x_train,y_train)
y_train_pred=model.predict(x_train)
y_test_predict=model.predict(x_test)
In [271]:
print('Train set accuracy :' +str(accuracy_score(y_train_pred,y_train)*100))
print('Test set accuracy :' +str(accuracy_score(y_test_predict,y_test)*100))
print('\nConfusion Matrix :\n%s'%confusion_matrix(y_test_predict,y_test))
print('\nClassificationReport : \n%s'%classification_report(y_test_predict,y_test))
Train set accuracy :68.09815950920245
Test set accuracy :63.722397476340696

Confusion Matrix :
[[  0   0   0   0   0   0]
 [  0   0   0   0   0   0]
 [  4   8 103  34   3   1]
 [  0   1  29  84  24   3]
 [  0   0   0   7  15   1]
 [  0   0   0   0   0   0]]

ClassificationReport : 
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.78      0.67      0.72       153
           6       0.67      0.60      0.63       141
           7       0.36      0.65      0.46        23
           8       0.00      0.00      0.00         0

    accuracy                           0.64       317
   macro avg       0.30      0.32      0.30       317
weighted avg       0.70      0.64      0.66       317

2nd method of syntax-¶

In [272]:
accuracy_train=model.score(x_train,y_train)
print('Training accuracy',accuracy_train*100,'%')
Training accuracy 68.09815950920245 %
In [273]:
accuracy_test=model.score(x_test,y_test)
print('Testing accuracy',accuracy_test*100,'%')
Testing accuracy 63.722397476340696 %
In [274]:
print(classification_report(y_test_predict,y_test))
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.78      0.67      0.72       153
           6       0.67      0.60      0.63       141
           7       0.36      0.65      0.46        23
           8       0.00      0.00      0.00         0

    accuracy                           0.64       317
   macro avg       0.30      0.32      0.30       317
weighted avg       0.70      0.64      0.66       317

In [275]:
print(confusion_matrix(y_test_predict,y_test))
[[  0   0   0   0   0   0]
 [  0   0   0   0   0   0]
 [  4   8 103  34   3   1]
 [  0   1  29  84  24   3]
 [  0   0   0   7  15   1]
 [  0   0   0   0   0   0]]

KNN¶

In [284]:
from sklearn.neighbors import KNeighborsClassifier
In [285]:
features=Newdata.drop('quality',axis=1)
target=Newdata['quality']
In [286]:
scale=StandardScaler()
In [287]:
scale.fit(features)
Out[287]:
StandardScaler()
In [288]:
scaled_features=scale.transform(features)
In [289]:
data_new=pd.DataFrame(scaled_features)
data_new.head(3)
Out[289]:
0 1 2 3 4 5 6 7 8 9 10
0 -0.521217 0.945317 -1.359004 -0.466051 -0.232585 -0.439504 -0.356276 0.555186 1.263220 -0.578131 -0.964069
1 -0.292749 1.952680 -1.359004 0.063144 0.231281 0.937574 0.745695 0.031522 -0.716191 0.127216 -0.593699
2 -0.292749 1.281105 -1.155466 -0.163654 0.104772 -0.046053 0.311585 0.136254 -0.333079 -0.049121 -0.593699
In [290]:
x_train,x_test,y_train,y_test=train_test_split(data_new,target,test_size=0.25,random_state=45)
In [505]:
model=KNeighborsClassifier(n_neighbors=54)
model.fit(x_train,y_train)
Out[505]:
KNeighborsClassifier(n_neighbors=54)
In [506]:
pred=model.predict(x_test)
pred
Out[506]:
array([5, 7, 6, 6, 6, 6, 5, 6, 6, 6, 6, 5, 5, 6, 6, 6, 5, 5, 5, 6, 5, 7,
       6, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 7, 6, 7, 6, 6, 5, 6, 6,
       5, 5, 5, 5, 6, 5, 6, 5, 6, 5, 6, 6, 5, 6, 5, 6, 6, 6, 5, 7, 6, 6,
       6, 6, 6, 7, 5, 6, 5, 6, 5, 6, 5, 5, 5, 6, 5, 6, 5, 7, 6, 6, 5, 5,
       6, 5, 6, 6, 6, 6, 5, 6, 5, 5, 6, 5, 6, 5, 5, 5, 6, 5, 5, 6, 6, 5,
       6, 6, 5, 5, 6, 5, 5, 6, 6, 6, 5, 7, 5, 5, 5, 6, 5, 5, 6, 6, 5, 6,
       5, 6, 5, 5, 6, 7, 5, 5, 5, 5, 6, 6, 5, 6, 5, 5, 5, 6, 6, 6, 6, 5,
       6, 5, 5, 7, 5, 6, 6, 5, 5, 5, 6, 5, 7, 5, 5, 5, 5, 5, 6, 5, 5, 5,
       6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 7, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, 6,
       6, 6, 6, 5, 6, 7, 5, 6, 6, 5, 6, 5, 5, 5, 5, 5, 6, 6, 5, 5, 5, 6,
       6, 5, 5, 6, 6, 6, 5, 5, 6, 5, 6, 6, 5, 6, 5, 6, 6, 5, 5, 5, 6, 6,
       5, 7, 6, 5, 6, 6, 5, 6, 6, 5, 6, 5, 5, 7, 7, 5, 5, 7, 6, 6, 5, 5,
       5, 6, 6, 5, 6, 6, 6, 6, 5, 5, 5, 6, 6, 6, 5, 5, 6, 6, 5],
      dtype=int64)
In [507]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.73      0.73      0.73       128
           6       0.51      0.67      0.58       106
           7       0.59      0.26      0.36        38
           8       0.00      0.00      0.00         4

    accuracy                           0.62       283
   macro avg       0.31      0.28      0.28       283
weighted avg       0.60      0.62      0.60       283

In [508]:
print(classification_report(y_test,pred))
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         6
           5       0.73      0.73      0.73       128
           6       0.51      0.67      0.58       106
           7       0.59      0.26      0.36        38
           8       0.00      0.00      0.00         4

    accuracy                           0.62       283
   macro avg       0.31      0.28      0.28       283
weighted avg       0.60      0.62      0.60       283

In [509]:
accuracy=model.score(x_test,y_test)
print(accuracy*100,'%')
61.83745583038869 %
In [499]:
#29,54,115,125,
In [ ]: